/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package datafu.pig.hash;

import java.util.Random;

import datafu.pig.hash.Hasher;
import com.google.common.hash.HashFunction;

import org.apache.pig.impl.logicalLayer.schema.Schema;

/**
 * Computes a hash value of a string using a randomly generated seed and
 * outputs it in hex.
 *
 * <p>This class should only be used for hashing algorithms that accept a seed
 * (murmur3-32, murmur3-128 and sip24).
 *
 * <p>It allows you to generate a well-mixed sequence of values, unpredictable
 * for every run, without relying on the random number generator for each
 * record. The seed is generated by the front end (i.e. when you launch your
 * script) and so is identical for every task.
 *
 * <p>The seed is produced in {@link #onReady(Schema, Schema)} and stored in the
 * instance properties under the key {@code "rand_seed"}; the first invocation
 * of {@link #call(String)} reads it back and builds the seeded hash function.
 *
 * @see datafu.pig.hash.Hasher
 */
public class HasherRand extends Hasher
{
  /** Instance-properties key under which the generated seed is stored. */
  private static final String RAND_SEED_KEY = "rand_seed";

  /**
   * Retained for backward compatibility with subclasses. Nothing in this class
   * assigns it, so on its own it cannot tell whether the seeded hash function
   * has already been built; {@code seededHashFuncReady} tracks that instead.
   * NOTE(review): this presumably shadows a same-named field in Hasher --
   * confirm, and consider removing it in a follow-up.
   */
  protected       HashFunction hash_func;

  /** Name of the hash algorithm, as given at instantiation. */
  protected final String       algorithm;

  /** True once {@link #call(String)} has built the seeded hash function. */
  private boolean seededHashFuncReady = false;

  /**
   * Generates hash values according to murmur3-32, a non-cryptographic-strength
   * hash function with good mixing.
   *
   * @see #HasherRand(String alg)
   */
  public HasherRand()
  {
    this("murmur3-32");
  }

  /**
   * Generates hash values according to the given hash algorithm.
   *
   * <p>The algorithm name is only validated later, in
   * {@link #onReady(Schema, Schema)}.
   *
   * @param alg the hash algorithm to use
   * @see #HasherRand()
   */
  public HasherRand(String alg)
  {
    algorithm = alg;
  }

  /**
   * Generates the hash for a string value.
   *
   * <p>On the first invocation the seed stored under {@code "rand_seed"} is read
   * from the instance properties and handed to the parent's
   * {@code makeHashFunc}; later invocations skip that step.
   *
   * @param val the single string to hash
   * @return val, hashed according to the algorithm specified at instantiation
   */
  @Override
  public String call(String val)
  {
    // The hash_func field of this class is never assigned here, so checking it
    // alone would rebuild the hash function for every record. The explicit flag
    // makes the one-time initialisation actually happen once; the null check is
    // kept so a subclass that assigns hash_func still bypasses the build.
    if (!seededHashFuncReady && hash_func == null) {
      String rand_seed = (String) getInstanceProperties().get(RAND_SEED_KEY);
      super.makeHashFunc(algorithm, rand_seed);
      // Only mark as ready after makeHashFunc returned without throwing.
      seededHashFuncReady = true;
    }
    return super.call(val);
  }

  /**
   * Generate a seed exactly once on the front end, so all workers get same value.
   *
   * <p>The seed is 8 hex digits (one random int) for murmur3-32 and murmur3-128,
   * and 32 hex digits (two random longs) for sip24.
   *
   * @param in_schema Input schema
   * @param out_schema Output schema
   * @throws IllegalArgumentException if the algorithm is not one of the seeded
   *         algorithms handled here (including a null algorithm name)
   */
  @Override
  protected void onReady(Schema in_schema, Schema out_schema)
  {
    final Random rg = getRandomGenerator();
    final String rand_seed;

    // Constant-first equals() so that a null algorithm name falls through to the
    // descriptive IllegalArgumentException below rather than a bare NPE.
    if ("murmur3-32".equals(algorithm) || "murmur3-128".equals(algorithm)) {
      // Both murmur3 variants are seeded from a single random int here.
      int rand_int = rg.nextInt();
      rand_seed = String.format("%08x", rand_int);
    }
    else if ("sip24".equals(algorithm)) {
      // sip24 is seeded from two random longs, concatenated as k0 then k1.
      long rand_k0 = rg.nextLong();
      long rand_k1 = rg.nextLong();
      rand_seed = String.format("%016x%016x", rand_k0, rand_k1);
    }
    else {
      throw new IllegalArgumentException("No hash function found for algorithm "+algorithm+" with a seed. Allowed values include "+SEEDED_HASH_NAMES);
    }

    getInstanceProperties().put(RAND_SEED_KEY, rand_seed);

    super.onReady(in_schema, out_schema);
  }

  /**
   * Supplies the random number generator used to create the seed.
   *
   * <p>Exists so tests can inject a constant seed by overriding this method.
   *
   * @return a new {@link Random} instance
   */
  protected Random getRandomGenerator()
  {
    return new Random();
  }
}
